org 100h

  mov al,13h
  int 10h
  push 0xa000
  pop es
  fninit

;Palette: diffuse + specular = L * clamp([0.25,H,1] + L^8/2)
  mov dx,3c8h
  xor ax,ax
  out dx,al
  inc dx
P:
 ;or bx,0b0000011100011111  ; bx = LLLLL... HHH.....
  or bx,0b0000000011111111  ; bx = LLLLL... HHH.....

  push di    ; b=1
  push bx    ; g=H
  push 0x40  ; r=0.25

  mov al,bh
POW:
  mul al
  mov al,ah
  inc si
  jpo POW    ; 3 times
  xchg ax,cx
  shr cl,1   ; cl=L^8/2 (0..127)

MAD:
  pop ax     ; rgb
  add al,cl  ; al=L^8/2 + rgb
  jnc CLAMP
  salc       ; clamp to 255
CLAMP:
  mul bh     ; ah=L*clamp(L^8/2 + rgb)
  shr ax,10
  out dx,al
  dec si
  jpo MAD    ; 3 times

  inc bx
  jnz P

  ;scasw ; di=0
  xor di,di

;For each frame: compute rotation matrices and comstants
M fld dword[TIME]
  fadd dword[TIME_DELTA]
  fst dword[TIME]
  fsincos          ;| slowC slowS

  fldl2e           ;=1.442695
  fmul dword[TIME]
  fsincos          ;| fastC fastS slowC slowS

  fldz
  fchs
  fld dword[K0_01] ;bx-128 -112 -96 -64   -48   -32   -16   0
  fld dword[K0_25] ;| 0.25 0.01 -0  fastC fastS slowC slowS .

;Store each constant four times (for SSE)
  mov bx,es        ; 0xa000
STORE:
  mov cx,4
STORE4:
  fst dword[bx]
  add bl,4
  loop STORE4

  fstp st0
  jns STORE        ; bx=0xa080

%define K    [bx-128] ; 0.25
%define EPS  [bx-112] ; 0.01
%define NABS [bx-96]  ; -0 = 0x80000000: for -abs()
%define COS  [bx-64]
%define SIN  [bx-48]

;For each pixel:
X:
  mov bx,0xa080
  mov cl,4
X4:
  mov ax,0xcccd
  mul di
  add dx,0x9b80
  mov [bx],ax
  mov [bx+2],dx
  add bx,4
  inc di
  loop X4         ; di+=4 bx=0xa090

%define INTX [bx-17]
%define INTY [bx-16]

%define x xmm0 ; inputs (destroyed)
%define y xmm1
%define z xmm2
%define o xmm3 ; output: orbit trap
%define a xmm4 ; scratch, output: estimated distance
%define b xmm5 ; scratch
%define R xmm6 ; translation radius

%define Z xmm7 ; depth

;  sub di,4
;  mov bx,0xa080+16
;Y4:
;  fild dword INTX
;  fmul dword[SCALE]
;  fmul st0
;  
;  fild dword INTY
;  fmul dword[SCALE]
;  fmul st0
;  
;  faddp
;  fsqrt
;  fmul dword[K256]
;  fistp word[bp]
;  mov ax,[bp]
;  stosb
;  add bx,4
;  cmp bx,0xa080+32
;  jne Y4



  movups x,INTX
  cvtdq2ps x,x ; x,y,z = X,Y,Z
  mulps x,[SCALE]
  mulps x,x

  cvtdq2ps y,INTY
  mulps y,[SCALE]
  mulps y,y

  addps x,y
  sqrtps a,x
  
  mulps a,[K256]
  cvtps2dq a,a
  packssdw a,a
  packuswb a,a     ; clamp to a byte 0..255
  movd [es:di-4],a


;    ;Trace a ray for 30 steps.
;      mov cl,30
;      movaps Z,[MINUS1] ; Z=-1
;    T addps Z,a
;      call MAP         ; a = map(X,Y,Z)
;      loop T
;    
;    ;Normal, ambient occlusion
;      movaps o,a
;      addps Z,EPS
;      call MAP         ; a = map(X,Y,Z-EPS)
;      subps a,o        ; a = map(X,Y,Z-EPS) - map(X,Y,Z)
;      divps a,EPS      ; a = (map(X,Y,Z-EPS) - map(X,Y,Z)) / EPS
;    
;    ;Fog
;    ;  minps Z,K1       ; Z = min(Z,1)
;    ;  mulps a,Z        ; a *= Z
;    
;    ;STORE:
;      mulps a,[K256]
;      cvtps2dq a,a
;      packssdw a,a
;      packuswb a,a     ; clamp to a byte 0..255
;    ;  movd eax,a
;    ;  movaps a,o
;    ;  xchg eax,ebp
;    ;  xor bx,32
;    ;  jpo STORE
;    ;  lea eax,[ebp*8+eax]
;    ;  stosd

;  movd [es:di-4],a
;  mov [es:di-4],edi
  test di,di
  jnz X

  in al,0x60
  dec al
  jnz M
  ret

MAP:
  cvtdq2ps x,INTX ; x,y,z = X,Y,Z
  movups y,INTY
  cvtdq2ps y,y
  mulps x,[SCALE]
  mulps y,[SCALE]

  movaps z,Z
;  xorps o,o    ; o=0
  movaps R,K   ; R=K
  mov ch,19    ; do 19 iterations

;Rotate in the xz and yx planes
L movaps b,SIN ; b=S a=C
  movaps a,COS
  mulps b,z    ; b=Sz
  mulps z,a    ; z=Cz
  mulps a,x    ; a=Cx
  mulps x,SIN  ; x=Sx
  addps a,b    ; a=x'=Cx+Sz
  subps z,x    ; z=z'=Cz-Sx

  movaps x,y   ; cycle x,y,z <- y,z,a
  movaps y,z
  movaps z,a
  xor bx,32
  jpo L

;Reflect x,y
  orps x,NABS  ; x=-abs(x)
  orps y,NABS  ; y=-abs(y)

;Translate, scale translation
  movaps a,R
  mulps a,K    ; a=K*R
  addps x,R    ; x+=R
  addps y,a    ; y+=K*R
  subps R,a    ; R*=1-K

;Squared distance
  movaps a,x
  movaps b,y
  mulps a,x    ; a=x*x
  mulps b,y    ; b=y*y
  addps b,a    ; b=x*x+y*y
  movaps a,z
  mulps a,z    ; a=z*z
  addps b,a    ; b=length^2=x*x+y*y+z*z

;Orbit trap
;  maxps o,b    ; o = max(o,length^2)

;Iterate
  dec ch
  jnz L

;Distance to a sphere
;  sqrtps a,b   ; a=length
  rsqrtps a,b  ; a=(length^2)^(-1/2)
  mulps a,b    ; a=(length^2)^(-1/2 + 1) = length

  subps a,R
  subps a,R    ; a=length-2R: offset distance by 2*radius
  minps a,K    ; a=min(length-2R,K): limit distance
  ret

TIME dd 0
TIME_DELTA dd 0.01
K0_25 dd 0.25
K0_01 dd 0.01

align 16
MINUS1 dd -1.0,-1.0,-1.0,-1.0
SCALE dd 2.3283064e-10,2.3283064e-10,2.3283064e-10,2.3283064e-10  ; 2^-32
K256 dd 256.0,256.0,256.0,256.0
